# List out the file names of the tumour and immune images.
# NOTE(review): the original assigned '100/CD4+_T_Cells' (an immune cell
# type) to `tumour` and '100/Invasive_Tumor' to `immune` — the directories
# appear swapped relative to the variable names, which would invert every
# downstream label. Fixed here; confirm against the actual data layout.
tumour <- list.files('100/Invasive_Tumor', full.names = TRUE) |> sample()
immune <- list.files('100/CD4+_T_Cells', full.names = TRUE) |> sample()

# Exclude the uninformative images (i.e. the blank ones). The pattern is
# anchored to the end of the path and the dot is escaped, so that files
# such as '110.png' or '10Xpng' are not accidentally dropped.
tumour <- tumour[!str_detect(tumour, '/10\\.png$')]
immune <- immune[!str_detect(immune, '/10\\.png$')]

# Read each image into memory; lapply keeps the result as a list
# (equivalent to the original sapply(..., simplify = FALSE)).
tumour_imgs <- lapply(tumour, EBImage::readImage)
immune_imgs <- lapply(immune, EBImage::readImage)

# Resize the images so that they all share the same height and width.
tumour_imgs <- lapply(tumour_imgs, resize, w = 100, h = 100)
immune_imgs <- lapply(immune_imgs, resize, w = 100, h = 100)

# Combine the images into one list, tumour first.
X <- c(tumour_imgs, immune_imgs)

# Stack the pixel arrays into a single array with the sample index as
# the first dimension.
Xmat <- abind(lapply(X, function(x) x@.Data), along = 0)

# Label the samples. Counts are derived from the lists themselves rather
# than hard-coded (originally rep(..., 1000) / rep(..., 976)), so the
# labels always line up with the stacked array even if file counts change.
y <- c(rep('tumour', length(tumour_imgs)),
       rep('immune', length(immune_imgs))) |>
  as.factor()
# Compute a colour-histogram (HOC) feature vector for one RGB image.
#
# img  : a 3-d numeric array (height x width x 3) with values in [0, 1]
#        — assumed from the seq(0, 1, ...) breaks; TODO confirm range.
# bins : number of histogram bins per channel (later: check 32 or 64 bins).
#
# Returns a numeric vector of length 3 * bins: the per-channel bin
# proportions for red, green and blue, concatenated in that order.
get_colour_histogram <- function(img, bins = 16) {
  # Shared bin edges for all three channels (hoisted out of the three
  # hist() calls, which originally each rebuilt the same sequence).
  breaks <- seq(0, 1, length.out = bins + 1)

  # Separate the RGB channels and compute one histogram per channel.
  r_hist <- hist(img[, , 1], breaks = breaks, plot = FALSE)
  g_hist <- hist(img[, , 2], breaks = breaks, plot = FALSE)
  b_hist <- hist(img[, , 3], breaks = breaks, plot = FALSE)

  # Flatten into a single vector of normalised bin counts.
  c(r_hist$counts / sum(r_hist$counts),
    g_hist$counts / sum(g_hist$counts),
    b_hist$counts / sum(b_hist$counts))
}

# Compute the colour histogram for every image in the dataset. lapply over
# the sample dimension replaces the original pattern of starting from
# hoc_ls = c() and growing it inside a for loop, which reallocates on
# every iteration.
hoc_ls <- lapply(seq_len(dim(Xmat)[1]), function(i) {
  get_colour_histogram(Xmat[i, , , ])
})

# Convert the list of histograms into a matrix (one row per image) and
# drop constant (zero-variance) columns.
Xhoc <- do.call(rbind, hoc_ls)
Xhoc <- remove_constant(Xhoc)
Code
set.seed(3888)

# Hold out 20% of each class as a test set: indices 1-1000 are one class,
# 1001-1976 the other. The two sample() calls run in the same order as
# the original, so the RNG stream (and hence the split) is unchanged.
tumour_test_id <- sample(1:1000, size = 200, replace = FALSE)
immune_test_id <- sample(1001:1976, size = 196, replace = FALSE)
test_id <- c(tumour_test_id, immune_test_id)

# Split the labels into training and test sets.
y_train <- y[-test_id]
y_test <- y[test_id]
# Fit the CNN and draw its learning curve.

# One-hot encode the training labels (model.matrix with -1 drops the
# intercept, giving one indicator column per class).
yMat_train <- model.matrix(~ y_train - 1)

# BUG FIX(review): the original passed the full training set as
# validation_data while also setting validation_split = 0.1. Keras ignores
# validation_split when validation_data is supplied, so the reported
# "validation" metrics were computed on the training data itself. Dropping
# validation_data lets validation_split hold out a genuine 10%. The
# redundant steps_per_epoch (= ceiling(n / batch_size), the default for
# array inputs) is also removed.
hist <- model |>
  fit(
    x = Xmat_train,
    y = yMat_train,
    batch_size = 16,
    epochs = 20,
    validation_split = 0.1,
    verbose = 0
  )

# Render the learning curve as an interactive plot.
(plot(hist) + theme_classic()) |> plotly_build()
# 5-fold cross-validation for hyperparameter tuning.
train_control <- trainControl(method = "cv", number = 5)

# Candidate neighbourhood sizes k = 1..10.
tune_grid <- expand.grid(k = 1:10)

# Train the KNN model on the HOG features.
knn_hog_model <- train(
  y ~ .,
  data = hog_train,
  method = "knn",
  trControl = train_control,
  tuneGrid = tune_grid
)
knn_hog_model$finalModel
1-nearest neighbor model
Training set outcome distribution:
immune tumour
780 800
# Cross-validation settings for the random forest.
train_control_rf <- trainControl(method = "cv", number = 5, search = "grid")

# Grid of mtry values (number of predictors tried at each split).
tune_grid <- expand.grid(mtry = c(3, 7, 10))

# Train the random forest using cross-validation.
# BUG FIX: the original passed trControl = train_control — the object left
# over from the KNN chunk — instead of the train_control_rf defined just
# above, so the search = "grid" setting was silently ignored.
rf_hog_model <- train(
  y ~ .,
  data = hog_train,
  method = "rf",
  trControl = train_control_rf,
  tuneGrid = tune_grid
)
rf_hog_model$finalModel
Call:
randomForest(x = x, y = y, mtry = param$mtry)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 10
OOB estimate of error rate: 14.87%
Confusion matrix:
immune tumour class.error
immune 697 83 0.1064103
tumour 152 648 0.1900000
Confusion Matrix and Statistics
Reference
Prediction immune tumour
immune 178 28
tumour 18 172
Accuracy : 0.8838
95% CI : (0.8481, 0.9137)
No Information Rate : 0.5051
P-Value [Acc > NIR] : <2e-16
Kappa : 0.7678
Mcnemar's Test P-Value : 0.1845
Sensitivity : 0.8600
Specificity : 0.9082
Pos Pred Value : 0.9053
Neg Pred Value : 0.8641
Prevalence : 0.5051
Detection Rate : 0.4343
Detection Prevalence : 0.4798
Balanced Accuracy : 0.8841
'Positive' Class : tumour
Model information
Code
# 5-fold cross-validation for hyperparameter tuning.
train_control <- trainControl(method = "cv", number = 5)

# Grid over the SVM cost and RBF-kernel width.
tune_grid <- expand.grid(
  C = c(1, 10, 100),
  sigma = c(0.01, 0.05, 0.1)
)

# Train the radial-kernel SVM on the HOG features.
svm_hog_model <- train(
  y ~ .,
  data = hog_train,
  method = "svmRadial",
  trControl = train_control,
  tuneGrid = tune_grid
)
svm_hog_model$finalModel
Support Vector Machine object of class "ksvm"
SV type: C-svc (classification)
parameter : cost C = 10
Gaussian Radial Basis kernel function.
Hyperparameter : sigma = 0.01
Number of Support Vectors : 1504
Objective Function Value : -547.3027
Training error : 0
Model test
Code
# Evaluate the HOG SVM on the held-out test set; 'tumour' is treated as
# the positive class in the confusion-matrix statistics.
svm_hog_pred <- predict(svm_hog_model, hog_test)
svm_hog_confMat <- confusionMatrix(
  svm_hog_pred,
  y_test,
  positive = 'tumour'
)
svm_hog_confMat
Confusion Matrix and Statistics
Reference
Prediction immune tumour
immune 153 25
tumour 43 175
Accuracy : 0.8283
95% CI : (0.7875, 0.8641)
No Information Rate : 0.5051
P-Value [Acc > NIR] : < 2e-16
Kappa : 0.6562
Mcnemar's Test P-Value : 0.03925
Sensitivity : 0.8750
Specificity : 0.7806
Pos Pred Value : 0.8028
Neg Pred Value : 0.8596
Prevalence : 0.5051
Detection Rate : 0.4419
Detection Prevalence : 0.5505
Balanced Accuracy : 0.8278
'Positive' Class : tumour
4. Models built on HOC
Code
# HOC training features (the original comments said "hog" here — these
# are the colour-histogram features).
Xhoc_train <- Xhoc[-test_id, ]
# HOC testing features.
Xhoc_test <- Xhoc[test_id, ]

# Training set as a data frame with the labels attached.
hoc_train <- as.data.frame(Xhoc_train)
hoc_train$y <- y_train

# Test set (labels are kept separately in y_test).
hoc_test <- as.data.frame(Xhoc_test)
# 5-fold cross-validation for hyperparameter tuning.
train_control <- trainControl(method = "cv", number = 5)

# Candidate neighbourhood sizes k = 1..10.
tune_grid <- expand.grid(k = 1:10)

# Train the KNN model on the HOC (colour-histogram) features.
knn_hoc_model <- train(
  y ~ .,
  data = hoc_train,
  method = "knn",
  trControl = train_control,
  tuneGrid = tune_grid
)
knn_hoc_model$finalModel
1-nearest neighbor model
Training set outcome distribution:
immune tumour
780 800
# Cross-validation settings for the random forest.
train_control_rf <- trainControl(method = "cv", number = 5, search = "grid")

# Grid of mtry values (number of predictors tried at each split).
tune_grid <- expand.grid(mtry = c(3, 7, 10))

# Train the random forest using cross-validation.
# BUG FIX: the original passed trControl = train_control — the stale object
# from the preceding KNN chunk — instead of the train_control_rf defined
# just above, so the search = "grid" setting was silently ignored.
rf_hoc_model <- train(
  y ~ .,
  data = hoc_train,
  method = "rf",
  trControl = train_control_rf,
  tuneGrid = tune_grid
)
rf_hoc_model$finalModel
Call:
randomForest(x = x, y = y, mtry = param$mtry)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 7
OOB estimate of error rate: 5.89%
Confusion matrix:
immune tumour class.error
immune 740 40 0.05128205
tumour 53 747 0.06625000
# 5-fold cross-validation for hyperparameter tuning.
train_control <- trainControl(method = "cv", number = 5)

# Grid over the SVM cost and RBF-kernel width.
tune_grid <- expand.grid(
  C = c(1, 10, 100),
  sigma = c(0.01, 0.05, 0.1)
)

# Train the radial-kernel SVM on the HOC (colour-histogram) features.
svm_hoc_model <- train(
  y ~ .,
  data = hoc_train,
  method = "svmRadial",
  trControl = train_control,
  tuneGrid = tune_grid
)
svm_hoc_model$finalModel
Support Vector Machine object of class "ksvm"
SV type: C-svc (classification)
parameter : cost C = 100
Gaussian Radial Basis kernel function.
Hyperparameter : sigma = 0.1
Number of Support Vectors : 463
Objective Function Value : -981.667
Training error : 0
Code
# Evaluate the HOC SVM on the held-out test set; 'tumour' is treated as
# the positive class in the confusion-matrix statistics.
svm_hoc_pred <- predict(svm_hoc_model, hoc_test)
svm_hoc_confMat <- confusionMatrix(
  svm_hoc_pred,
  y_test,
  positive = 'tumour'
)
svm_hoc_confMat
Confusion Matrix and Statistics
Reference
Prediction immune tumour
immune 191 3
tumour 5 197
Accuracy : 0.9798
95% CI : (0.9606, 0.9912)
No Information Rate : 0.5051
P-Value [Acc > NIR] : <2e-16
Kappa : 0.9596
Mcnemar's Test P-Value : 0.7237
Sensitivity : 0.9850
Specificity : 0.9745
Pos Pred Value : 0.9752
Neg Pred Value : 0.9845
Prevalence : 0.5051
Detection Rate : 0.4975
Detection Prevalence : 0.5101
Balanced Accuracy : 0.9797
'Positive' Class : tumour